Set the working dir
setwd("/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR")
Libraries
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
suppressPackageStartupMessages(library(VennDiagram))
suppressMessages(source("~/Git/UPSCb/src/R/volcanoPlot.R"))
Source some helper functions
source("~/Git/UPSCb/src/R/plot.multidensity.R")
source("~/Git/UPSCb/src/R/featureSelection.R")
Create a palette
pal <- brewer.pal(9,"Paired")
Register the default plot margin
mar <- par("mar")
Read the sample information
kg <- read.table("/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/analysis_Tom/batch2-unormalised-gene-expression_data.csv", header = TRUE)
load("/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/analysis_Tom/batch2_counts.rda")
summarise to genes
#tx2gene <- data.frame(TXID=rownames(tx$counts),
# GENEID=sub("\\.[0-9]+","",rownames(tx$counts)))
#gx <- summarizeToGene(tx,tx2gene=tx2gene)
#kg <- round(gx$counts)
Sanity check
stopifnot(all(colnames(kg) == samples$SciLifeID))
sel <- rowSums(kg) == 0
# str(sel)= logical vector
# sums the counts for one gene in each samples
# show TRUE or FALSE for each gene if the gene is never expressed (has 0 counts in every time samples)
sprintf("%s%% (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(kg),digits=1),
sum(sel),
nrow(kg))
## [1] "18.6% (6014) of 32310 genes are not expressed"
#in our experiment the number of genes and transcript are the same (one transcript for one gene)
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
plot(density(log10(rowMeans(kg))),col=pal[1],
main="gene mean raw counts distribution",
xlab="mean raw counts (log10)")
#density=likelihood, chance
# plot the likelihood that the mean of genes has this mean of raw counts
# give an idea of the read depth, here the majority of gene maps ~30 reads (1.5 log10)
cols <- sample(pal,nlevels(samples$SampleName),replace = TRUE)
plot.multidensity(mclapply(1:ncol(kg),function(k){log10(kg)[,k]},mc.cores=16L),
col=cols[as.integer(samples$User.ID)],
legend.x="topright",
legend=levels(samples$User.ID),
legend.col=cols,
legend.lwd=2,
main="samples raw counts distribution",
xlab="per gene raw counts (log10)")
plotMDS(kg, cex=0.7)
conditions1 <- factor(paste(samples$AZD,samples$Nutrition))
pch<-as.integer(factor(samples$Timepoint))+14
pc <- prcomp(t(kg))
percent <- round(summary(pc)$importance[2,]*100);percent
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15
## 49 23 13 7 3 2 1 0 0 0 0 0 0 0 0
## PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PC31 PC32 PC33 PC34 PC35 PC36
## 0 0 0 0 0 0
factor(substr(sub(".*_","",samples$SciLifeID),1,1))
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2
## Levels: 2
plot(0,0,type="n",xlim=range(pc$x[,1]),ylim=range(pc$x[,2]))
text(pc$x[,1],pc$x[,2],cex=.7,substr(sub(".*_","",samples$SciLifeID),1,1),adj = c(1,-1))
Looking for conditions coming from different batches
samples[samples$AZD=="DMSO" & samples$Nutrition == "NP",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 13 P11554_213 I6-7 6 NP DMSO
## 14 P11554_214 I6-8 6 NP DMSO
## 15 P11554_215 I6-9 6 NP DMSO
## 37 P11554_237 I24-7 24 NP DMSO
samples[samples$AZD=="DMSO" & samples$Nutrition == "NPS",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 7 P11554_207 I6-1 6 NPS DMSO
## 8 P11554_208 I6-2 6 NPS DMSO
## 9 P11554_209 I6-3 6 NPS DMSO
## 31 P11554_231 J24-1 24 NPS DMSO
## 32 P11554_232 I24-2 24 NPS DMSO
## 33 P11554_233 I24-3 24 NPS DMSO
samples[samples$AZD=="DMSO" & samples$Nutrition == "PKS",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 19 P11554_219 I6-13 6 PKS DMSO
## 20 P11554_220 I6-14 6 PKS DMSO
## 21 P11554_221 I6-15 6 PKS DMSO
samples[samples$AZD=="DMSO" & samples$Nutrition == "NS",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 25 P11554_225 I6-19 6 NS DMSO
## 26 P11554_226 I6-20 6 NS DMSO
## 27 P11554_227 I6-21 6 NS DMSO
samples[samples$AZD=="AZD" & samples$Nutrition == "NP",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 16 P11554_216 I6-10 6 NP AZD
## 17 P11554_217 I6-11 6 NP AZD
## 18 P11554_218 I6-12 6 NP AZD
samples[samples$AZD=="AZD" & samples$Nutrition == "NPS",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 10 P11554_210 J6-4 6 NPS AZD
## 11 P11554_211 I6-5 6 NPS AZD
## 12 P11554_212 I6-6 6 NPS AZD
## 34 P11554_234 I24-4 24 NPS AZD
## 35 P11554_235 I24-5 24 NPS AZD
## 36 P11554_236 J24-6 24 NPS AZD
samples[samples$AZD=="AZD" & samples$Nutrition == "PKS",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 22 P11554_222 I6-16 6 PKS AZD
## 23 P11554_223 I6-17 6 PKS AZD
## 24 P11554_224 I6-18 6 PKS AZD
samples[samples$AZD=="AZD" & samples$Nutrition == "NS",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 28 P11554_228 I6-22 6 NS AZD
## 29 P11554_229 I6-23 6 NS AZD
## 30 P11554_230 I6-24 6 NS AZD
samples[samples$AZD=="0",]
## SciLifeID SampleName Timepoint Nutrition AZD
## 2 P11554_202 J0-7 0 T0 0
## 3 P11554_203 J0-23 0 T0 0
## 4 P11554_204 J0-5 0 T0 0
## 5 P11554_205 J0-21 0 T0 0
## 6 P11554_206 J0-11 0 T0 0
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(conditions1))],
pch=pch)
legend("bottomleft",pch=19,
col=pal[as.integer(levels(factor(as.integer(conditions1))))],
legend=levels(factor(conditions1)))
legend("topleft",pch=as.integer(levels(factor(pch))),
col="black",
legend=levels(factor(samples$Timepoint)))
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(factor(conditions1))],
pch=pch)
legend("topleft",pch=19,
col=pal[as.integer(levels(factor(as.integer(conditions1))))],
legend=levels(factor(conditions1)))
legend("topright",pch=as.integer(levels(factor(pch))),
col="black",
legend=levels(factor(samples$Timepoint)))
par(mar=c(5.1, 4.1, 4.1, 2.1))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
Create the dds object, without giving any prior on the design
#
conditions <- factor(paste(samples$Timepoint,".",samples$Nutrition,".",samples$AZD))
conditions <- relevel(conditions,ref = "0 . T0 . 0")
dds.kg <- DESeqDataSetFromMatrix(
countData = kg,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
Check the size factors (i.e. the sequencing library size effect)
dds.kg <- estimateSizeFactors(dds.kg)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
sizes.kg <- sizeFactors(dds.kg)
# names(sizes.kt) <- colnames(kt)
# not useful sizes.kt already have the right names
pander(sizes.kg)
| P11554_202 | P11554_203 | P11554_204 | P11554_205 | P11554_206 | P11554_207 |
|---|---|---|---|---|---|
| 0.6333 | 0.8213 | 0.6677 | 0.8274 | 0.9137 | 1.187 |
| P11554_208 | P11554_209 | P11554_210 | P11554_211 | P11554_212 | P11554_213 |
|---|---|---|---|---|---|
| 0.9961 | 0.8321 | 0.9085 | 1.342 | 1.074 | 0.9243 |
| P11554_214 | P11554_215 | P11554_216 | P11554_217 | P11554_218 | P11554_219 |
|---|---|---|---|---|---|
| 1.105 | 1.05 | 1.045 | 1.107 | 1.201 | 1.404 |
| P11554_220 | P11554_221 | P11554_222 | P11554_223 | P11554_224 | P11554_225 |
|---|---|---|---|---|---|
| 1.071 | 1.185 | 1.305 | 1.025 | 1.104 | 0.6884 |
| P11554_226 | P11554_227 | P11554_228 | P11554_229 | P11554_230 | P11554_231 |
|---|---|---|---|---|---|
| 1.019 | 1.06 | 1.237 | 0.7092 | 0.8619 | 0.8808 |
| P11554_232 | P11554_233 | P11554_234 | P11554_235 | P11554_236 | P11554_237 |
|---|---|---|---|---|---|
| 0.7324 | 0.7617 | 1.324 | 1.565 | 1.312 | 1.146 |
boxplot(sizes.kg, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="gray")
# There is a small variation between the mediane and the value 1.0
system.time(vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE))
## user system elapsed
## 15.808 8.800 12.609
vst_blind <- assay(vsd.kg)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_blind <- vst_blind - min(vst_blind)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_blind[rowSums(vst_blind)>0,]) #mean variance stabilized between 0.5 and 1
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
#VST validated: mean variance stabilized around 0.5
write.csv(vst_blind,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedToT0-data-vst-blind.csv")
#save(kg, vst_blind, file="vst_blind.rda")
save(kg, vst_blind, file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/vst_blind.rda")
Calculate vst aware with DESeq2 taking the date as a parameter
#
system.time(vsd2 <- varianceStabilizingTransformation(dds.kg, blind=FALSE))
## user system elapsed
## 28.656 10.964 24.931
vst_aware <- assay(vsd2)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_aware <- vst_aware - min(vst_aware)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_aware[rowSums(vst_aware)>0,]) #mean variance stabilized between 0 and 0.5
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
# vst_aware validated: mean variance is way lower when vst is calculated "aware of the date factor" compare to before when vst was calculated "blind"
# stabilized around 0.2 instead of 0.5 when vst was blind
write.csv(vst_aware,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalized-data-vst-aware.csv")
#save(kg, vst_aware, file="vst_aware.rda")
#load("vst_aware.rda")
Differential expression
dds.kg <- DESeq(dds.kg)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
which results
resultsNames(dds.kg)
## [1] "Intercept"
## [2] "condition_24...NP...DMSO_vs_0...T0...0"
## [3] "condition_24...NPS...AZD_vs_0...T0...0"
## [4] "condition_24...NPS...DMSO_vs_0...T0...0"
## [5] "condition_6...NP...AZD_vs_0...T0...0"
## [6] "condition_6...NP...DMSO_vs_0...T0...0"
## [7] "condition_6...NPS...AZD_vs_0...T0...0"
## [8] "condition_6...NPS...DMSO_vs_0...T0...0"
## [9] "condition_6...NS...AZD_vs_0...T0...0"
## [10] "condition_6...NS...DMSO_vs_0...T0...0"
## [11] "condition_6...PKS...AZD_vs_0...T0...0"
## [12] "condition_6...PKS...DMSO_vs_0...T0...0"
res <- results(dds.kg,name="condition_6...NS...DMSO_vs_0...T0...0")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
six_NS_DMSO_vs_T0 <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NS_DMSO_vs_T0-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NS_DMSO_vs_T0-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...NPS...DMSO_vs_0...T0...0")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
six_NPS_DMSO_vs_T0 <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NPS_DMSO_vs_T0-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NPS_DMSO_vs_T0-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...NS...AZD_vs_0...T0...0")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
six_NS_AZD_vs_T0 <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NS_AZD_vs_T0-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NS_AZD_vs_T0-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...NPS...AZD_vs_0...T0...0")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
six_NPS_AZD_vs_T0 <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NPS_AZD_vs_T0-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NPS_AZD_vs_T0-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...PKS...DMSO_vs_0...T0...0")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
six_PKS_DMSO_vs_T0 <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_PKS_DMSO_vs_T0-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_PKS_DMSO_vs_T0-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...NP...DMSO_vs_0...T0...0")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
six_NP_DMSO_vs_T0 <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NP_DMSO_vs_T0-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NP_DMSO_vs_T0-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...PKS...AZD_vs_0...T0...0")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
six_PKS_AZD_vs_T0 <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_PKS_AZD_vs_T0-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_PKS_AZD_vs_T0-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...NP...AZD_vs_0...T0...0")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
six_NP_AZD_vs_T0 <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NP_AZD_vs_T0-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/6_NP_AZD_vs_T0-DEgenes__significant.csv")
SixH <- VennDiagram::calculate.overlap(list(six_NS_DMSO_vs_T0, six_NS_AZD_vs_T0,six_NPS_AZD_vs_T0,six_NPS_DMSO_vs_T0))
grid.newpage()
svg("Venn1.svg", width = 7, height = 7)
grid.draw(venn.diagram(list(six_NS_DMSO_vs_T0, six_NS_AZD_vs_T0,six_NPS_AZD_vs_T0,six_NPS_DMSO_vs_T0),
filename=NULL,
col=pal[1:4],
category.names=c("PStarv","PStarv_AZD","AZD","NoStarv")))
dev.off()
## png
## 2
grid.draw(venn.diagram(list(six_NS_DMSO_vs_T0, six_NS_AZD_vs_T0,six_NPS_AZD_vs_T0,six_NPS_DMSO_vs_T0),
filename=NULL,
col=pal[1:4],
category.names=c("PStarv","PStarv_AZD","AZD","NoStarv")))
SixH <- VennDiagram::calculate.overlap(list(six_NS_DMSO_vs_T0,
six_NPS_DMSO_vs_T0,
six_PKS_DMSO_vs_T0,
six_NP_DMSO_vs_T0))
grid.newpage()
svg("Venn2.svg", width = 7, height = 7)
grid.draw(venn.diagram(list(six_NS_DMSO_vs_T0,
six_NPS_DMSO_vs_T0,
six_PKS_DMSO_vs_T0,
six_NP_DMSO_vs_T0),
filename=NULL,
col=pal[1:4],
category.names=c("Pstarv","full","Nstarv","Sstarv")))
dev.off()
## png
## 2
grid.draw(venn.diagram(list(six_NS_DMSO_vs_T0,
six_NPS_DMSO_vs_T0,
six_PKS_DMSO_vs_T0,
six_NP_DMSO_vs_T0),
filename=NULL,
col=pal[1:4],
category.names=c("Pstarv","full","Nstarv","Sstarv")))
SixH <- VennDiagram::calculate.overlap(list(six_NS_AZD_vs_T0,
six_NPS_AZD_vs_T0,
six_PKS_AZD_vs_T0,
six_NP_AZD_vs_T0))
grid.newpage()
svg("Venn3.svg", width = 7, height = 7)
grid.draw(venn.diagram(list(six_NS_AZD_vs_T0,
six_NPS_AZD_vs_T0,
six_PKS_AZD_vs_T0,
six_NP_AZD_vs_T0),
filename=NULL,
col=pal[1:4],
category.names=c("Pstarv & AZD","full & AZD","Nstarv & AZD","Sstarv & AZD")))
dev.off()
## png
## 2
grid.draw(venn.diagram(list(six_NS_AZD_vs_T0,
six_NPS_AZD_vs_T0,
six_PKS_AZD_vs_T0,
six_NP_AZD_vs_T0),
filename=NULL,
col=pal[1:4],
category.names=c("Pstarv & AZD","full & AZD","Nstarv & AZD","Sstarv & AZD")))
conditions <- factor(paste(samples$Timepoint,".",samples$Nutrition,".",samples$AZD))
conditions <- relevel(conditions,ref = "6 . NPS . DMSO")
dds.kg <- DESeqDataSetFromMatrix(
countData = kg,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
Check the size factors (i.e. the sequencing library size effect)
dds.kg <- estimateSizeFactors(dds.kg)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
sizes.kg <- sizeFactors(dds.kg)
# names(sizes.kt) <- colnames(kt)
# not useful sizes.kt already have the right names
pander(sizes.kg)
| P11554_202 | P11554_203 | P11554_204 | P11554_205 | P11554_206 | P11554_207 |
|---|---|---|---|---|---|
| 0.6333 | 0.8213 | 0.6677 | 0.8274 | 0.9137 | 1.187 |
| P11554_208 | P11554_209 | P11554_210 | P11554_211 | P11554_212 | P11554_213 |
|---|---|---|---|---|---|
| 0.9961 | 0.8321 | 0.9085 | 1.342 | 1.074 | 0.9243 |
| P11554_214 | P11554_215 | P11554_216 | P11554_217 | P11554_218 | P11554_219 |
|---|---|---|---|---|---|
| 1.105 | 1.05 | 1.045 | 1.107 | 1.201 | 1.404 |
| P11554_220 | P11554_221 | P11554_222 | P11554_223 | P11554_224 | P11554_225 |
|---|---|---|---|---|---|
| 1.071 | 1.185 | 1.305 | 1.025 | 1.104 | 0.6884 |
| P11554_226 | P11554_227 | P11554_228 | P11554_229 | P11554_230 | P11554_231 |
|---|---|---|---|---|---|
| 1.019 | 1.06 | 1.237 | 0.7092 | 0.8619 | 0.8808 |
| P11554_232 | P11554_233 | P11554_234 | P11554_235 | P11554_236 | P11554_237 |
|---|---|---|---|---|---|
| 0.7324 | 0.7617 | 1.324 | 1.565 | 1.312 | 1.146 |
boxplot(sizes.kg, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="gray")
# There is a small variation between the mediane and the value 1.0
system.time(vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE))
## user system elapsed
## 14.188 7.976 11.199
vst_blind <- assay(vsd.kg)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_blind <- vst_blind - min(vst_blind)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_blind[rowSums(vst_blind)>0,]) #mean variance stabilized between 0.5 and 1
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
#VST validated: mean variance stabilized around 0.5
write.csv(vst_blind,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedTo6_NPS_DMSO-data-vst-blind.csv")
#save(kg, vst_blind, file="vst_blind.rda")
save(kg, vst_blind, file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/vst_blind.rda")
Calculate vst aware with DESeq2 taking the date as a parameter
#
system.time(vsd2 <- varianceStabilizingTransformation(dds.kg, blind=FALSE))
## user system elapsed
## 26.896 8.736 24.066
vst_aware <- assay(vsd2)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_aware <- vst_aware - min(vst_aware)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_aware[rowSums(vst_aware)>0,]) #mean variance stabilized between 0 and 0.5
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
# vst_aware validated: mean variance is way lower when vst is calculated "aware of the date factor" compare to before when vst was calculated "blind"
# stabilized around 0.2 instead of 0.5 when vst was blind
write.csv(vst_aware,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedTo_6_NPS_DMSO-data-vst-aware.csv")
#save(kg, vst_aware, file="vst_aware.rda")
#load("vst_aware.rda")
Differential expression
dds.kg <- DESeq(dds.kg)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
which results
resultsNames(dds.kg)
## [1] "Intercept"
## [2] "condition_0...T0...0_vs_6...NPS...DMSO"
## [3] "condition_24...NP...DMSO_vs_6...NPS...DMSO"
## [4] "condition_24...NPS...AZD_vs_6...NPS...DMSO"
## [5] "condition_24...NPS...DMSO_vs_6...NPS...DMSO"
## [6] "condition_6...NP...AZD_vs_6...NPS...DMSO"
## [7] "condition_6...NP...DMSO_vs_6...NPS...DMSO"
## [8] "condition_6...NPS...AZD_vs_6...NPS...DMSO"
## [9] "condition_6...NS...AZD_vs_6...NPS...DMSO"
## [10] "condition_6...NS...DMSO_vs_6...NPS...DMSO"
## [11] "condition_6...PKS...AZD_vs_6...NPS...DMSO"
## [12] "condition_6...PKS...DMSO_vs_6...NPS...DMSO"
res <- results(dds.kg,name="condition_6...NPS...AZD_vs_6...NPS...DMSO")
alpha=0.01
plotMA(res)
#
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
AZDvsDMSO_in_NPS <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/AZDvsDMSO_in_NPS-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/AZDvsDMSO_in_NPS-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...NS...DMSO_vs_6...NPS...DMSO")
alpha=0.01
plotMA(res)
#
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
NSvsNPS_in_DMSO <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/NSvsNPS_in_DMSO-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/NSvsNPS_in_DMSO-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...PKS...DMSO_vs_6...NPS...DMSO")
alpha=0.01
plotMA(res)
#
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
PKSvsNPS_in_DMSO <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/PKSvsNPS_in_DMSO-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/APKSvsNPS_in_DMSO-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...NP...DMSO_vs_6...NPS...DMSO")
alpha=0.01
plotMA(res)
#
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
NPvsNPS_in_DMSO <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/NPvsNPS_in_DMSO-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/NPvsNPS_in_DMSO-DEgenes__significant.csv")
res <- results(dds.kg,name="condition_6...NS...AZD_vs_6...NPS...DMSO")
alpha=0.01
plotMA(res)
#
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
NS.AZD_vs_NPS.DMSO <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/NS.AZD_vs_NPS.DMSO-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/NS.AZD_vs_NPS.DMSO-DEgenes__significant.csv")
SixH <- VennDiagram::calculate.overlap(list(NSvsNPS_in_DMSO,
AZDvsDMSO_in_NPS,
NS.AZD_vs_NPS.DMSO))
grid.newpage()
svg("Venn5.svg", width = 7, height = 7)
grid.draw(venn.diagram(list(NSvsNPS_in_DMSO,
AZDvsDMSO_in_NPS,
NS.AZD_vs_NPS.DMSO),
filename=NULL,
col=pal[1:3],
category.names=c("Pstarv","AZD","Pstarv & AZD")))
dev.off()
## png
## 2
grid.draw(venn.diagram(list(NSvsNPS_in_DMSO,
AZDvsDMSO_in_NPS,
NS.AZD_vs_NPS.DMSO),
filename=NULL,
col=pal[1:3],
category.names=c("Pstarv","AZD","Pstarv & AZD")))
conditions <- factor(paste(samples$Timepoint,".",samples$Nutrition,".",samples$AZD))
conditions <- relevel(conditions,ref = "6 . PKS . DMSO")
dds.kg <- DESeqDataSetFromMatrix(
countData = kg,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
Check the size factors (i.e. the sequencing library size effect)
dds.kg <- estimateSizeFactors(dds.kg)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
sizes.kg <- sizeFactors(dds.kg)
# names(sizes.kt) <- colnames(kt)
# not useful sizes.kt already have the right names
pander(sizes.kg)
| P11554_202 | P11554_203 | P11554_204 | P11554_205 | P11554_206 | P11554_207 |
|---|---|---|---|---|---|
| 0.6333 | 0.8213 | 0.6677 | 0.8274 | 0.9137 | 1.187 |
| P11554_208 | P11554_209 | P11554_210 | P11554_211 | P11554_212 | P11554_213 |
|---|---|---|---|---|---|
| 0.9961 | 0.8321 | 0.9085 | 1.342 | 1.074 | 0.9243 |
| P11554_214 | P11554_215 | P11554_216 | P11554_217 | P11554_218 | P11554_219 |
|---|---|---|---|---|---|
| 1.105 | 1.05 | 1.045 | 1.107 | 1.201 | 1.404 |
| P11554_220 | P11554_221 | P11554_222 | P11554_223 | P11554_224 | P11554_225 |
|---|---|---|---|---|---|
| 1.071 | 1.185 | 1.305 | 1.025 | 1.104 | 0.6884 |
| P11554_226 | P11554_227 | P11554_228 | P11554_229 | P11554_230 | P11554_231 |
|---|---|---|---|---|---|
| 1.019 | 1.06 | 1.237 | 0.7092 | 0.8619 | 0.8808 |
| P11554_232 | P11554_233 | P11554_234 | P11554_235 | P11554_236 | P11554_237 |
|---|---|---|---|---|---|
| 0.7324 | 0.7617 | 1.324 | 1.565 | 1.312 | 1.146 |
boxplot(sizes.kg, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="gray")
# There is a small variation between the mediane and the value 1.0
system.time(vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE))
## user system elapsed
## 14.300 8.144 11.165
vst_blind <- assay(vsd.kg)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_blind <- vst_blind - min(vst_blind)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_blind[rowSums(vst_blind)>0,]) #mean variance stabilized between 0.5 and 1
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
#VST validated: mean variance stabilized around 0.5
write.csv(vst_blind,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedTo6_PKS_DMSO-data-vst-blind.csv")
#save(kg, vst_blind, file="vst_blind.rda")
save(kg, vst_blind, file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/vst_blind.rda")
Calculate vst aware with DESeq2 taking the date as a parameter
#
system.time(vsd2 <- varianceStabilizingTransformation(dds.kg, blind=FALSE))
## user system elapsed
## 26.684 8.348 23.804
vst_aware <- assay(vsd2)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_aware <- vst_aware - min(vst_aware)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_aware[rowSums(vst_aware)>0,]) #mean variance stabilized between 0 and 0.5
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
# vst_aware validated: mean variance is way lower when vst is calculated "aware of the date factor" compare to before when vst was calculated "blind"
# stabilized around 0.2 instead of 0.5 when vst was blind
write.csv(vst_aware,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedTo_6_PKS_DMSO-data-vst-aware.csv")
#save(kg, vst_aware, file="vst_aware.rda")
#load("vst_aware.rda")
Differential expression
dds.kg <- DESeq(dds.kg)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
which results
resultsNames(dds.kg)
## [1] "Intercept"
## [2] "condition_0...T0...0_vs_6...PKS...DMSO"
## [3] "condition_24...NP...DMSO_vs_6...PKS...DMSO"
## [4] "condition_24...NPS...AZD_vs_6...PKS...DMSO"
## [5] "condition_24...NPS...DMSO_vs_6...PKS...DMSO"
## [6] "condition_6...NP...AZD_vs_6...PKS...DMSO"
## [7] "condition_6...NP...DMSO_vs_6...PKS...DMSO"
## [8] "condition_6...NPS...AZD_vs_6...PKS...DMSO"
## [9] "condition_6...NPS...DMSO_vs_6...PKS...DMSO"
## [10] "condition_6...NS...AZD_vs_6...PKS...DMSO"
## [11] "condition_6...NS...DMSO_vs_6...PKS...DMSO"
## [12] "condition_6...PKS...AZD_vs_6...PKS...DMSO"
res <- results(dds.kg,name="condition_6...PKS...AZD_vs_6...PKS...DMSO")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
AZDvsDMSO_in_PKS <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/AZDvsDMSO_in_PKS-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/AZDvsDMSO_in_PKS-DEgenes__significant.csv")
conditions <- factor(paste(samples$Timepoint,".",samples$Nutrition,".",samples$AZD))
conditions <- relevel(conditions,ref = "6 . NS . DMSO")
dds.kg <- DESeqDataSetFromMatrix(
countData = kg,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
Check the size factors (i.e. the sequencing library size effect)
dds.kg <- estimateSizeFactors(dds.kg)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
sizes.kg <- sizeFactors(dds.kg)
# names(sizes.kt) <- colnames(kt)
# not useful sizes.kt already have the right names
pander(sizes.kg)
| P11554_202 | P11554_203 | P11554_204 | P11554_205 | P11554_206 | P11554_207 |
|---|---|---|---|---|---|
| 0.6333 | 0.8213 | 0.6677 | 0.8274 | 0.9137 | 1.187 |
| P11554_208 | P11554_209 | P11554_210 | P11554_211 | P11554_212 | P11554_213 |
|---|---|---|---|---|---|
| 0.9961 | 0.8321 | 0.9085 | 1.342 | 1.074 | 0.9243 |
| P11554_214 | P11554_215 | P11554_216 | P11554_217 | P11554_218 | P11554_219 |
|---|---|---|---|---|---|
| 1.105 | 1.05 | 1.045 | 1.107 | 1.201 | 1.404 |
| P11554_220 | P11554_221 | P11554_222 | P11554_223 | P11554_224 | P11554_225 |
|---|---|---|---|---|---|
| 1.071 | 1.185 | 1.305 | 1.025 | 1.104 | 0.6884 |
| P11554_226 | P11554_227 | P11554_228 | P11554_229 | P11554_230 | P11554_231 |
|---|---|---|---|---|---|
| 1.019 | 1.06 | 1.237 | 0.7092 | 0.8619 | 0.8808 |
| P11554_232 | P11554_233 | P11554_234 | P11554_235 | P11554_236 | P11554_237 |
|---|---|---|---|---|---|
| 0.7324 | 0.7617 | 1.324 | 1.565 | 1.312 | 1.146 |
boxplot(sizes.kg, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="gray")
# There is a small variation between the mediane and the value 1.0
system.time(vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE))
## user system elapsed
## 14.100 8.480 11.007
vst_blind <- assay(vsd.kg)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_blind <- vst_blind - min(vst_blind)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_blind[rowSums(vst_blind)>0,]) #mean variance stabilized between 0.5 and 1
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
#VST validated: mean variance stabilized around 0.5
write.csv(vst_blind,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedTo6_NS_DMSO-data-vst-blind.csv")
#save(kg, vst_blind, file="vst_blind.rda")
save(kg, vst_blind, file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/vst_blind.rda")
Calculate vst aware with DESeq2 taking the date as a parameter #### Model-aware
system.time(vsd2 <- varianceStabilizingTransformation(dds.kg, blind=FALSE))
## user system elapsed
## 27.160 9.396 24.056
vst_aware <- assay(vsd2)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_aware <- vst_aware - min(vst_aware)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_aware[rowSums(vst_aware)>0,]) #mean variance stabilized between 0 and 0.5
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
# vst_aware validated: mean variance is way lower when vst is calculated "aware of the date factor" compare to before when vst was calculated "blind"
# stabilized around 0.2 instead of 0.5 when vst was blind
write.csv(vst_aware,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedTo_6_NS_DMSO-data-vst-aware.csv")
#save(kg, vst_aware, file="vst_aware.rda")
#load("vst_aware.rda")
Differential expression
dds.kg <- DESeq(dds.kg)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
which results
resultsNames(dds.kg)
## [1] "Intercept"
## [2] "condition_0...T0...0_vs_6...NS...DMSO"
## [3] "condition_24...NP...DMSO_vs_6...NS...DMSO"
## [4] "condition_24...NPS...AZD_vs_6...NS...DMSO"
## [5] "condition_24...NPS...DMSO_vs_6...NS...DMSO"
## [6] "condition_6...NP...AZD_vs_6...NS...DMSO"
## [7] "condition_6...NP...DMSO_vs_6...NS...DMSO"
## [8] "condition_6...NPS...AZD_vs_6...NS...DMSO"
## [9] "condition_6...NPS...DMSO_vs_6...NS...DMSO"
## [10] "condition_6...NS...AZD_vs_6...NS...DMSO"
## [11] "condition_6...PKS...AZD_vs_6...NS...DMSO"
## [12] "condition_6...PKS...DMSO_vs_6...NS...DMSO"
res <- results(dds.kg,name="condition_6...NS...AZD_vs_6...NS...DMSO")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
AZDvsDMSO_in_NS <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/AZDvsDMSO_in_NS-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/AZDvsDMSO_in_NS-DEgenes__significant.csv")
conditions <- factor(paste(samples$Timepoint,".",samples$Nutrition,".",samples$AZD))
conditions <- relevel(conditions,ref = "6 . NP . DMSO")
dds.kg <- DESeqDataSetFromMatrix(
countData = kg,
colData = data.frame(condition=conditions),
design = ~ condition)
## converting counts to integer mode
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
Check the size factors (i.e. the sequencing library size effect)
dds.kg <- estimateSizeFactors(dds.kg)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
sizes.kg <- sizeFactors(dds.kg)
# names(sizes.kt) <- colnames(kt)
# not useful sizes.kt already have the right names
pander(sizes.kg)
| P11554_202 | P11554_203 | P11554_204 | P11554_205 | P11554_206 | P11554_207 |
|---|---|---|---|---|---|
| 0.6333 | 0.8213 | 0.6677 | 0.8274 | 0.9137 | 1.187 |
| P11554_208 | P11554_209 | P11554_210 | P11554_211 | P11554_212 | P11554_213 |
|---|---|---|---|---|---|
| 0.9961 | 0.8321 | 0.9085 | 1.342 | 1.074 | 0.9243 |
| P11554_214 | P11554_215 | P11554_216 | P11554_217 | P11554_218 | P11554_219 |
|---|---|---|---|---|---|
| 1.105 | 1.05 | 1.045 | 1.107 | 1.201 | 1.404 |
| P11554_220 | P11554_221 | P11554_222 | P11554_223 | P11554_224 | P11554_225 |
|---|---|---|---|---|---|
| 1.071 | 1.185 | 1.305 | 1.025 | 1.104 | 0.6884 |
| P11554_226 | P11554_227 | P11554_228 | P11554_229 | P11554_230 | P11554_231 |
|---|---|---|---|---|---|
| 1.019 | 1.06 | 1.237 | 0.7092 | 0.8619 | 0.8808 |
| P11554_232 | P11554_233 | P11554_234 | P11554_235 | P11554_236 | P11554_237 |
|---|---|---|---|---|---|
| 0.7324 | 0.7617 | 1.324 | 1.565 | 1.312 | 1.146 |
boxplot(sizes.kg, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="gray")
# There is a small variation between the mediane and the value 1.0
system.time(vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE))
## user system elapsed
## 14.056 8.096 11.040
vst_blind <- assay(vsd.kg)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_blind <- vst_blind - min(vst_blind)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_blind[rowSums(vst_blind)>0,]) #mean variance stabilized between 0.5 and 1
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
#VST validated: mean variance stabilized around 0.5
write.csv(vst_blind,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedTo6_NP_DMSO-data-vst-blind.csv")
#save(kg, vst_blind, file="vst_blind.rda")
save(kg, vst_blind, file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/vst_blind.rda")
Calculate vst aware with DESeq2 taking the date as a parameter
#
system.time(vsd2 <- varianceStabilizingTransformation(dds.kg, blind=FALSE))
## user system elapsed
## 26.824 8.372 23.707
vst_aware <- assay(vsd2)
#colnames(vst.kt) <- colnames(kt)
#already have the right names
vst_aware <- vst_aware - min(vst_aware)
# substracts from all values the smaller value, so the smaller value is now 0
Validate the VST
meanSdPlot(vst_aware[rowSums(vst_aware)>0,]) #mean variance stabilized between 0 and 0.5
meanSdPlot(log2(kg+1)) #without Variance Stabilizing Transformation: variation around 1
meanSdPlot(log2(counts(dds.kg,normalized=TRUE)+1)) #normalized read depth: variance is not stabilized ("curvy")
# vst_aware validated: mean variance is way lower when vst is calculated "aware of the date factor" compare to before when vst was calculated "blind"
# stabilized around 0.2 instead of 0.5 when vst was blind
write.csv(vst_aware,"/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/Tom-normalizedTo_6_NP_DMSO-data-vst-aware.csv")
#save(kg, vst_aware, file="vst_aware.rda")
#load("vst_aware.rda")
Differential expression
dds.kg <- DESeq(dds.kg)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
which results
resultsNames(dds.kg)
## [1] "Intercept"
## [2] "condition_0...T0...0_vs_6...NP...DMSO"
## [3] "condition_24...NP...DMSO_vs_6...NP...DMSO"
## [4] "condition_24...NPS...AZD_vs_6...NP...DMSO"
## [5] "condition_24...NPS...DMSO_vs_6...NP...DMSO"
## [6] "condition_6...NP...AZD_vs_6...NP...DMSO"
## [7] "condition_6...NPS...AZD_vs_6...NP...DMSO"
## [8] "condition_6...NPS...DMSO_vs_6...NP...DMSO"
## [9] "condition_6...NS...AZD_vs_6...NP...DMSO"
## [10] "condition_6...NS...DMSO_vs_6...NP...DMSO"
## [11] "condition_6...PKS...AZD_vs_6...NP...DMSO"
## [12] "condition_6...PKS...DMSO_vs_6...NP...DMSO"
res <- results(dds.kg,name="condition_6...NP...AZD_vs_6...NP...DMSO")
alpha=0.01
plotMA(res)
The volcano plot is very flat, cross-validating the results of the MA plot
volcanoPlot(res,alpha=alpha)
Which is as expected, over-enriched for adjusted p-values of 1
hist(res$padj,breaks=seq(0,1,.01))
# select genes that are significant
alpha=0.01
l2fc = 0.5
sel <- ! is.na(res$padj) & res$padj <= alpha & abs(res$log2FoldChange) >= l2fc
# list of DE genes that are significant
AZDvsDMSO_in_NP <- rownames(res[sel,])
write.csv(res,file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/AZDvsDMSO_in_NP-DEgenes_all.csv")
write.csv(res[sel,],file="/mnt/picea/projects/arabidopsis/jhanson/arabidopsis-nutrition-TOR/AZDvsDMSO_in_NP-DEgenes__significant.csv")
SixH <- VennDiagram::calculate.overlap(list(AZDvsDMSO_in_NPS,
AZDvsDMSO_in_PKS,
AZDvsDMSO_in_NS,
AZDvsDMSO_in_NP))
grid.newpage()
svg("Venn4.svg", width = 7, height = 7)
grid.draw(venn.diagram(list(AZDvsDMSO_in_NPS,
AZDvsDMSO_in_PKS,
AZDvsDMSO_in_NS,
AZDvsDMSO_in_NP),
filename=NULL,
col=pal[1:4],
category.names=c("AZD in NPS","AZD in PKS","AZD in NS","AZD in NP")))
dev.off()
## png
## 2
grid.draw(venn.diagram(list(AZDvsDMSO_in_NPS,
AZDvsDMSO_in_PKS,
AZDvsDMSO_in_NS,
AZDvsDMSO_in_NP),
filename=NULL,
col=pal[1:4],
category.names=c("AZD in NPS","AZD in PKS","AZD in NS","AZD in NP")))
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] hexbin_1.27.2 limma_3.38.3
## [3] VennDiagram_1.6.20 futile.logger_1.4.3
## [5] vsn_3.50.0 tximport_1.11.7
## [7] scatterplot3d_0.3-41 RColorBrewer_1.1-2
## [9] pander_0.6.3 LSD_4.0-0
## [11] gplots_3.0.1.1 DESeq2_1.22.2
## [13] SummarizedExperiment_1.12.0 DelayedArray_0.8.0
## [15] BiocParallel_1.16.6 matrixStats_0.54.0
## [17] Biobase_2.42.0 GenomicRanges_1.34.0
## [19] GenomeInfoDb_1.18.2 IRanges_2.16.0
## [21] S4Vectors_0.20.1 BiocGenerics_0.28.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_0.9-7 tools_3.5.1
## [4] backports_1.1.4 R6_2.4.0 affyio_1.52.0
## [7] rpart_4.1-15 KernSmooth_2.23-15 Hmisc_4.2-0
## [10] DBI_1.0.0 lazyeval_0.2.2 colorspace_1.4-1
## [13] nnet_7.3-12 tidyselect_0.2.5 gridExtra_2.3
## [16] bit_1.1-14 compiler_3.5.1 preprocessCore_1.44.0
## [19] formatR_1.6 htmlTable_1.13.1 labeling_0.3
## [22] caTools_1.17.1.2 scales_1.0.0 checkmate_1.9.1
## [25] genefilter_1.64.0 affy_1.60.0 stringr_1.4.0
## [28] digest_0.6.18 foreign_0.8-71 rmarkdown_1.12
## [31] XVector_0.22.0 base64enc_0.1-3 pkgconfig_2.0.2
## [34] htmltools_0.3.6 highr_0.8 htmlwidgets_1.3
## [37] rlang_0.3.4 rstudioapi_0.10 RSQLite_2.1.1
## [40] gtools_3.8.1 acepack_1.4.1 dplyr_0.8.0.1
## [43] RCurl_1.95-4.12 magrittr_1.5 GenomeInfoDbData_1.2.0
## [46] Formula_1.2-3 Matrix_1.2-17 Rcpp_1.0.1
## [49] munsell_0.5.0 stringi_1.4.3 yaml_2.2.0
## [52] zlibbioc_1.28.0 plyr_1.8.4 blob_1.1.1
## [55] gdata_2.18.0 crayon_1.3.4 lattice_0.20-38
## [58] splines_3.5.1 annotate_1.60.1 locfit_1.5-9.1
## [61] knitr_1.22 pillar_1.3.1 geneplotter_1.60.0
## [64] futile.options_1.0.1 XML_3.98-1.19 glue_1.3.1
## [67] evaluate_0.13 latticeExtra_0.6-28 lambda.r_1.2.3
## [70] data.table_1.12.2 BiocManager_1.30.4 gtable_0.3.0
## [73] purrr_0.3.2 assertthat_0.2.1 ggplot2_3.1.1
## [76] xfun_0.6 xtable_1.8-4 survival_2.44-1.1
## [79] tibble_2.1.1 AnnotationDbi_1.44.0 memoise_1.1.0
## [82] cluster_2.0.8